#include <assert.h>
#include <stdint.h>     /* uintptr_t */
#include <stdio.h>
#include <stdlib.h>     /* rand, srand */
#include <string.h>
#include <time.h>       /* time() */

#include <sys/time.h>

/* Nonzero if either X or Y is not aligned on a "long" boundary.
   The pointers are converted through uintptr_t so only their address bits
   are examined, and both arguments are parenthesized so that arbitrary
   pointer expressions can be passed safely.  */
#define UNALIGNED(X, Y) \
    ((((uintptr_t)(X)) | ((uintptr_t)(Y))) & (sizeof(long) - 1))

/* How many bytes are copied each iteration of the 4X unrolled loop.  */
#define BIGBLOCKSIZE    (sizeof(long) << 2)

/* How many bytes are copied each iteration of the word copy loop.  */
#define LITTLEBLOCKSIZE (sizeof(long))

/* Threshold for punting to the byte copier.  */
#define TOO_SMALL(LEN)  ((LEN) < BIGBLOCKSIZE)

/*
 * Copy len0 bytes from src0 to dest0 and return dest0.
 *
 * When the request is at least BIGBLOCKSIZE bytes long and both pointers are
 * aligned on a "long" boundary, the bulk of the data is moved one long at a
 * time (four longs per iteration while enough bytes remain) and only the
 * remainder is copied byte by byte.  In every other case the whole request
 * goes through the byte loop.
 *
 * The copy always runs front to back; nothing is done to handle overlapping
 * regions.  A zero-length request is a no-op and returns dest0 unchanged.
 * For a non-empty request both pointers must be non-NULL (checked by assert).
 *
 * NOTE(review): the word loops access the buffers through "long" lvalues;
 * callers should pass storage that may legitimately be accessed that way
 * (for example memory obtained from malloc).
 */
char *memcopy_super(char* dest0, const char *src0, size_t len0)
{
    char *dest = dest0;
    const char *src = src0;

    /* Nothing to copy: behave like memcpy instead of tripping the assert.  */
    if (len0 == 0) {
        return dest0;
    }

    assert(dest0 && src0);

    /* If the size is small, or either SRC or DST is unaligned,
       then punt into the byte copy loop.  This should be rare.  */
    if (!TOO_SMALL(len0) && !UNALIGNED(src, dest)) {
        long *aligned_dest = (long *)dest;
        const long *aligned_src = (const long *)src;

        /* Copy 4X long words at a time if possible.  */
        while (len0 >= BIGBLOCKSIZE) {
            *aligned_dest++ = *aligned_src++;
            *aligned_dest++ = *aligned_src++;
            *aligned_dest++ = *aligned_src++;
            *aligned_dest++ = *aligned_src++;
            len0 -= BIGBLOCKSIZE;
        }

        /* Copy one long word at a time if possible.  */
        while (len0 >= LITTLEBLOCKSIZE) {
            *aligned_dest++ = *aligned_src++;
            len0 -= LITTLEBLOCKSIZE;
        }

        /* Pick up any residual with a byte copier.  */
        dest = (char *)aligned_dest;
        src = (const char *)aligned_src;
    }

    while (len0--) {
        *dest++ = *src++;
    }

    return dest0;
}

/*
 * Fill data[0] .. data[len - 1] with pseudo-random bytes drawn from rand().
 *
 * The generator is seeded from the wall clock the first time this function
 * runs.  Seeding only once matters: time() has one-second resolution, so
 * reseeding on every call would make two calls within the same second
 * produce identical data.
 *
 * Nothing is written when len is zero or negative.
 */
static void get_rand_bytes(unsigned char *data, int len)
{
    static int seeded;
    int i;

    if (!seeded) {
        srand((unsigned)time(NULL));    /* plant the random seed */
        seeded = 1;
    }

    for (i = 0; i < len; i++) {
        /* Reduce modulo 256 (not 255) so every byte value 0..255 can occur. */
        data[i] = (unsigned char)(rand() % 256);
    }
}

/*
 * Return the current system time reported by gettimeofday(), expressed in
 * microseconds and reduced to the width of an int.
 *
 * The full microsecond count does not fit in an int, so only its low-order
 * bits are returned and the value wraps periodically.  It is therefore only
 * meaningful for measuring short intervals: subtract two samples in unsigned
 * arithmetic to get the elapsed microseconds.
 *
 * Returns 0 if gettimeofday() fails.
 */
static int get_cur_time_us(void)
{
    struct timeval tv;
    unsigned long long usec;

    /* Read the current system time. */
    if (gettimeofday(&tv, NULL) != 0) {
        return 0;
    }

    /* Seconds and microseconds are combined in a 64-bit unsigned type so the
       multiplication cannot overflow even where time_t is only 32 bits. */
    usec = (unsigned long long)tv.tv_sec * 1000000ULL
         + (unsigned long long)tv.tv_usec;

    /* Keep the low-order bits only; see the wrap-around note above. */
    return (int)(unsigned int)usec;
}

/* Number of elements in the array N.  N must be a real array, not a pointer;
   the whole expansion is parenthesized so it is safe inside expressions.  */
#define ARRAY_SIZE(n)  (sizeof(n) / sizeof((n)[0]))

/*
 * Benchmark driver.
 *
 * Allocates two heap buffers, fills the first with random bytes, and then,
 * for every size in size_list, times one memcpy() and one memcopy_super() of
 * that many bytes from the first buffer to the second, printing both elapsed
 * times in microseconds.
 *
 * Returns 0 on success, or EXIT_FAILURE if the buffers cannot be allocated.
 */
int main(void)
{
    /* Both buffers are sized from the first entry, so it must be the largest. */
    int size_list[] = {
        1024 * 1024 * 10,  // 10MB
        1024 * 1024 * 1,   // 1MB
        1024 * 100,        // 100KB
        1024 * 10,         // 10KB
        1024 * 1,          // 1KB
    };
    char *data1;
    char *data2;
    unsigned int t1;
    unsigned int t2;
    size_t i;

    data1 = malloc((size_t)size_list[0]);
    data2 = malloc((size_t)size_list[0]);
    if (data1 == NULL || data2 == NULL) {
        fprintf(stderr, "failed to allocate two %d-byte buffers\n", size_list[0]);
        free(data1);
        free(data2);
        return EXIT_FAILURE;
    }

    get_rand_bytes((unsigned char *)data1, size_list[0]);

    for (i = 0; i < ARRAY_SIZE(size_list); i++) {
        /* Timestamps are held and subtracted as unsigned values: the int
           returned by get_cur_time_us() may wrap between two samples, and
           unsigned subtraction still yields the right short interval without
           signed overflow. */
        t1 = (unsigned int)get_cur_time_us();
        memcpy(data2, data1, (size_t)size_list[i]);
        t2 = (unsigned int)get_cur_time_us();
        printf("copy %d bytes, memcpy   waste time %dus\n", size_list[i], (int)(t2 - t1));

        t1 = (unsigned int)get_cur_time_us();
        memcopy_super(data2, data1, (size_t)size_list[i]);
        t2 = (unsigned int)get_cur_time_us();
        printf("copy %d bytes, memcopy_super  waste time %dus\n\n", size_list[i], (int)(t2 - t1));
    }

    free(data1);
    free(data2);

    return 0;
}

#if 0

copy 10485760 bytes, memcpy   waste time 6502us
copy 10485760 bytes, memcopy_super  waste time 12689us

copy 1048576 bytes, memcpy   waste time 659us
copy 1048576 bytes, memcopy_super  waste time 999us

copy 102400 bytes, memcpy   waste time 76us
copy 102400 bytes, memcopy_super  waste time 73us

copy 10240 bytes, memcpy   waste time 1us
copy 10240 bytes, memcopy_super  waste time 5us

copy 1024 bytes, memcpy   waste time 1us
copy 1024 bytes, memcopy_super  waste time 1us


#endif
